# coding:utf8

'''
National Enterprise Credit Information Publicity System (Shanghai)
全国企业信用信息公示系统（上海）
Maintainer: 肖迪 (Xiao Di)
'''
# import pycurl
# import urllib
import re
# from utils import kill_captcha
# import StringIO
import random
from scpy.logger import get_logger

logger = get_logger(__file__)
# from bs4 import BeautifulSoup
import json
import table
import requests
import time

requests.packages.urllib3.disable_warnings()
s = requests.Session()
TIME_OUT = 30


def verify(name):
    # global detail_url
    head = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "www.sgs.gov.cn",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36"}
    loop_num = 0
    while 1:
        try:
            verify_url = 'https://www.sgs.gov.cn/notice/captcha?preset=&ra=%d' % random.random()
            # code = curl('https://www.sgs.gov.cn/notice/search/ent_info_list',cookie=1)
            code1 = s.get('https://www.sgs.gov.cn/notice/search/ent_info_list', verify=False, headers=head, timeout=TIME_OUT).content
            # verify_url = "https://www.sgs.gov.cn/notice/captcha?preset=&ra=%d"%random.random()
            # verify_image = curl(verify_url)
            verify_image = s.get(verify_url, verify=False, timeout=TIME_OUT).content
            # verify = kill_captcha(verify_image,'sh','jpeg')
            # print code
            code = re.findall('code: "(.+?)"', code1)[0]
            print code
            data = {"searchType": 1, "captcha": 0, "session.token": code, "condition.keyword": name}
            search_url = 'https://www.sgs.gov.cn/notice/search/ent_info_list'
            # html = curl(search_url,data)
            html = s.post(search_url, data=data, headers=head, timeout=TIME_OUT).content
            print html
            try:
                detail_url = re.findall('<a href="(https://www\.sgs\.gov\.cn/notice/notice/view.+?)"', html)[0]
            except:
                return ''
            # detail_html = curl(detail_url)
            detail_html = s.get(detail_url, verify=False, headers=head, timeout=TIME_OUT).content

            # print detail_html
            return (detail_html, detail_url)

        except Exception, e:
            time.sleep(2)
            loop_num += 1
            if loop_num <= 5:
                logger.exception(e)
                continue
            else:
                break


def run(tuplecontent, **args):
    """Parse a company detail page into the structured crawl result.

    :param tuplecontent: ``(detail_html, detail_url)`` as returned by
                         :func:`verify`.
    :param args: optional flags -- ``type=1`` requests the extended return
                 value (raw html and url records in addition to the parsed
                 data); ``searchkey`` (or legacy ``searchword``) is the
                 keyword that produced this page.
    :returns: the ``alldata`` dict, or the tuple
              ``(html_source, alldata, companyUrl)`` when ``type == 1``.
    """
    detail_html = tuplecontent[0]
    detail_url = tuplecontent[1]
    tables = re.findall('<table[\s\S]+?</table>', detail_html)
    for j in tables:
        # Section headers look like <th colspan="N">SECTION NAME</th>.
        word = re.findall('<th colspan="\d+?">(.+?)</th>', j)
        if '股东信息' in j:  # shareholder information table
            shareHolderList = table.index('股东信息', j)
            if shareHolderList:
                for i in shareHolderList:
                    share_url = re.findall('href="(.+?)"', i['shareHolderdetail'])
                    if share_url:
                        # Follow the per-shareholder detail page for the
                        # subscribed-capital amount and contribution form.
                        share_url = share_url[0]
                        html = s.get(share_url, verify=False, timeout=TIME_OUT).content
                        html = html.replace(' ', '')
                        subConam = re.findall('invt\.subConAm="(.+?)"', html)[0]
                        regCapCur = re.findall('invt\.conForm="(.+?)"', html)[0]
                        i['shareHolderdetail'] = share_url
                        i['subConam'] = subConam
                        i['conDate'] = ''
                        i['fundedRatio'] = ''
                        i['regCapCur'] = regCapCur
                        i['country'] = ''
                    else:
                        i['shareHolderdetail'] = ''
                        i['subConam'] = ''
                        i['conDate'] = ''
                        i['fundedRatio'] = ''
                        i['regCapCur'] = ''
                        i['country'] = ''
        try:
            # Dispatch on the table's header text; word[0] raises IndexError
            # for header-less tables, which we simply skip.
            if '基本信息' == word[0]:  # basic information
                basicList = table.index(word[0].replace(' ', ''), j)
            if '主要人员信息' == word[0]:  # key personnel
                personList = table.index(word[0].replace(' ', ''), j)
            if '变更信息' == word[0]:  # change records
                alterList = table.index(word[0].replace(' ', ''), j)
            if '分支机构信息' == word[0]:  # branch offices
                filiationList = table.index(word[0].replace(' ', ''), j)
            if '清算信息' == word[0]:  # liquidation
                liquidationList = table.index(word[0].replace(' ', ''), j)
            if '经营异常' == word[0] or '经营异常信息' == word[0]:  # abnormal operation
                abnormalOperation = table.index(word[0].replace(' ', ''), j)
        except:
            logger.info('skipping unrecognised table header: %s', word)
            continue
    # Sections absent from the page leave their variable unbound; default
    # each one to an empty list via a NameError probe.
    try:
        basicList
    except NameError:
        basicList = []
    try:
        shareHolderList
    except NameError:
        shareHolderList = []
    try:
        personList
    except NameError:
        personList = []
    try:
        alterList
    except NameError:
        alterList = []
    try:
        filiationList
    except NameError:
        filiationList = []
    try:
        liquidationList
    except NameError:
        liquidationList = []
    try:
        abnormalOperation
    except NameError:
        abnormalOperation = []

    # Categories this source never provides -- kept for a uniform schema.
    punishBreakList = []
    punishedList = []
    alidebtList = []
    entinvItemList = []
    frinvList = []
    frPositionList = []
    caseInfoList = []
    sharesFrostList = []
    sharesImpawnList = []
    morDetailList = []
    morguaInfoList = []
    # tab=02 is the annual-report listing for the same company.
    report_url = detail_url.replace('tab=01', 'tab=02')
    html = s.get(report_url, verify=False, timeout=TIME_OUT).content
    report_url = re.findall('"(https://www.sgs\.gov\.cn/notice/notice/view_annual.+?)" target="_blank">(\d+)', html)
    yearReportList = []
    yearList = []
    for url_temp, year in report_url:
        html = s.get(url_temp, verify=False, timeout=TIME_OUT).content
        table_list = re.findall('<table[\s\S]+?</table>', html)
        for j in table_list:
            if '企业基本信息' in j:  # report: basic info
                report_basic = table.report_basic(j)
            if '网站或网店信息' in j:  # report: website / online shop
                report_website = table.report_website(j)
            if '企业资产状况信息' in j:  # report: assets
                report_assetsInfo = table.report_assetsInfo(j)
            if '股东及出资信息' in j:  # report: investors
                report_investorInformations = table.report_investorInformations(j)
            if '股权变更信息' in j:  # report: equity changes
                report_equityChangeInformations = table.report_equityChangeInformations(j)
            if '修改记录' in j:  # report: modification records
                report_changeRecords = table.report_changeRecords(j)
        # Default any section never seen so far.  NOTE: values parsed for an
        # earlier year deliberately carry over when a later year omits the
        # section -- this matches the original behaviour.  Probing after the
        # loop (rather than inside it) also avoids a NameError when a report
        # page contains no tables at all.
        try:
            report_basic
        except NameError:
            report_basic = []
        try:
            report_website
        except NameError:
            report_website = {}
        try:
            report_assetsInfo
        except NameError:
            report_assetsInfo = []
        try:
            report_investorInformations
        except NameError:
            report_investorInformations = []
        try:
            report_equityChangeInformations
        except NameError:
            report_equityChangeInformations = []
        try:
            report_changeRecords
        except NameError:
            report_changeRecords = []
        dit1 = {"year": year, "baseInfo": report_basic, "website": report_website,
                "investorInformations": report_investorInformations, "assetsInfo": report_assetsInfo,
                "equityChangeInformations": report_equityChangeInformations, "changeRecords": report_changeRecords}
        dit2 = {"year": year, "html": html}
        yearList.append(dit2)
        yearReportList.append(dit1)
    alldata = {'province': 'sh', "abnormalOperation": abnormalOperation, "basicList": basicList,
               "shareHolderList": shareHolderList, "personList": personList, "punishBreakList": punishBreakList,
               "punishedList": punishedList, "alidebtList": alidebtList, "entinvItemList": entinvItemList,
               "frinvList": frinvList, "frPositionList": frPositionList, "alterList": alterList,
               "filiationList": filiationList, "caseInfoList": caseInfoList, "sharesFrostList": sharesFrostList,
               "sharesImpawnList": sharesImpawnList, "morDetailList": morDetailList, "morguaInfoList": morguaInfoList,
               "liquidationList": liquidationList, "yearReportList": yearReportList}
    if args.get('type') == 1:
        # BUG FIX: callers in this file historically passed 'searchword'
        # while this function only read 'searchkey', so the keyword was
        # always reported as "none".  Accept both spellings.
        keyword = args.get('searchkey', args.get('searchword', 'none'))
        html_source = {"province": "sh", "type": 0, "html": detail_html, "keyword": keyword,
                       "companyName": basicList[0]['enterpriseName'], "yearList": yearList}
        companyUrl = {"province": "sh", "url": detail_url, "method": "get",
                      "companyName": basicList[0]['enterpriseName']}
        return (html_source, alldata, companyUrl)
    return alldata


def search(key):
    """Crawl the parsed company record for *key* (no raw-html extras).

    Returns ``{}`` when the company cannot be found.
    """
    page = verify(key)
    if not page:
        return {}
    return run(page)


def search2(key):
    """Crawl the full company record for *key*, including the raw html and
    url records (``run(..., type=1)`` form).

    Returns ``{}`` when the company cannot be found.
    """
    page = verify(key)
    if not page:
        return {}
    # BUG FIX: run() reads args['searchkey']; the original passed
    # 'searchword', so the reported keyword was always "none".
    return run(page, type=1, searchkey=key)


def search3(data):
    """Re-crawl a known company from a previously stored url record.

    :param data: dict with at least 'url' and 'companyName' keys.
    :returns: the ``(html_source, alldata, companyUrl)`` tuple from run().
    """
    url = data.get('url')
    html = s.get(url, verify=False, timeout=TIME_OUT).content
    key = data.get('companyName')
    # BUG FIX: pass the keyword under 'searchkey' (the key run() reads),
    # not 'searchword'.
    return run((html, url), type=1, searchkey=key)


if __name__ == "__main__":
    # data = {
    #     "province": "sh",
    #     "url": "https://www.sgs.gov.cn/notice/notice/view?uuid=9DfasM8QpxmVxiLRB3D_alW74wQF1eZM&tab=01",
    #     "method": "get",
    #     "companyName": "上海福美来房地产经纪有限公司"
    # }
    # # for i in name_list:
    # #     res = json.dumps(search(i),ensure_ascii=False,indent=4)
    # #     print res
    # #     open('/Users/xiaodi/tempdir/sccess1.txt','a').write(res+'\n')
    # res = json.dumps(search2(u'上海福美来房地产经纪有限公司'), ensure_ascii=False, indent=4)
    # # res = json.dumps(search3(data),ensure_ascii=False,indent=4)
    # print res
    req_data = {"companyName" : "国宏消防工程集团有限公司第一分公司", "url" : "https://www.sgs.gov.cn/notice/notice/view?uuid=YueKBFLXXkwWION.WWuiB3WWaXbelK8p&tab=01", "province" : "sh", "data" : "/businessPublicity.jspx?id=3D3E618DFFB01D45365F1304D30EC967", "method" : "get" }
    print json.dumps(search3(req_data),ensure_ascii=False,indent=4)

